started: 2016
last updated: 19Jan2017

Notes

Source data includes
- the output generated by wes pipeline in nov2016
- table for samples info, including wes-gwas correspondence and pass/fail info about samples in wes
- several tables with phenotype data, obtained from DC at different times
- table with cases where pathogenic BRCA1, BRCA2 or PALB2 variants were detected

Importantly, exac and kgen data may not be correct (especially for the retained multiallelic sites) because GATK variant-to-table tool did not handle multiallelic sites correctly at the time of data generation.

start_section

# Time stamp
Sys.time()
[1] "2017-01-19 20:43:12 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
source_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/source_data"
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"

copy_source_ngs_data

Done only once. Hence FALSE in copying.

src_folder <- "/scratch/medgen/users/alexey/wecare_3_nov2016/wecare_nfe_nov2016_vqsr_shf_sma_ann_txt"
tgt_folder <- source_data_folder
prefix="wecare_nfe_nov2016_vqsr_shf_sma_ann"
gt_file <- paste(prefix,"GT_add.txt",sep="_")
gq_file <- paste(prefix,"GQ.txt",sep="_")
dp_file <- paste(prefix,"DP.txt",sep="_")
vv_file <- paste(prefix,"VV.txt",sep="_")
exac_file <- paste(prefix,"exac.txt",sep="_")
kgen_file <- paste(prefix,"kgen.txt",sep="_")
file.copy(
  paste(src_folder, gt_file, sep="/"),
  paste(tgt_folder, gt_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, gq_file, sep="/"),
  paste(tgt_folder, gq_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, dp_file, sep="/"),
  paste(tgt_folder, dp_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, vv_file, sep="/"),
  paste(tgt_folder, vv_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, exac_file, sep="/"),
  paste(tgt_folder, exac_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, kgen_file, sep="/"),
  paste(tgt_folder, kgen_file, sep="/"))
[1] FALSE
# Clean-up
rm(src_folder, tgt_folder)

copy_source_phenotype_data

Done only once. Hence FALSE in copying.

Files with phenotypes updates (Dec2016) and cases with BRCA1, BRCA2 and PALB2 (Dec 2016) were added manually.

src_folder <- "/scratch/medgen/users/alexey/wecare_stat_2_aug2016/source_data"
tgt_folder <- source_data_folder
covar_file <- "covar.txt"
samples_file <- "samples_ids.txt"
demographics_file <- "WECARE.Exome.DemographicVariables.txt"
file.copy(
  paste(src_folder, covar_file, sep="/"),
  paste(tgt_folder, covar_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, samples_file, sep="/"),
  paste(tgt_folder, samples_file, sep="/"))
[1] FALSE
file.copy(
  paste(src_folder, demographics_file, sep="/"),
  paste(tgt_folder, demographics_file, sep="/"))
[1] FALSE
# Clean-up
rm(src_folder, tgt_folder)

read_data

gt.df <- read.table(
  paste(source_data_folder, gt_file, sep="/"), 
  header=TRUE, row.names=1, sep="\t", quote="")
gq.df <- read.table(
  paste(source_data_folder, gq_file, sep="/"), 
  header=TRUE, row.names=1, sep="\t", quote="")
dp.df <- read.table(
  paste(source_data_folder, dp_file, sep="/"), 
  header=TRUE, row.names=1, sep="\t", quote="")
covar.df <- read.table(
  paste(source_data_folder, covar_file, sep="/"), 
  sep="\t", header=TRUE, quote="")
samples.df <- read.table(
  paste(source_data_folder, samples_file, sep="/"), 
  sep="\t", header=TRUE, quote="")
demographics.df <- read.table(
  paste(source_data_folder, demographics_file, sep="/"), 
  sep="\t", header=TRUE, quote="")
vv.df <- read.table(
  paste(source_data_folder, vv_file, sep="/"), 
  header=TRUE, sep="\t", quote="")
kgen.df <- read.table(
  paste(source_data_folder, kgen_file, sep="/"), 
  header=TRUE, sep="\t", quote="")
exac.df <- read.table(
  paste(source_data_folder, exac_file, sep="/"), 
  header=TRUE, sep="\t", quote="")
phenotypes_update_file <- "phenotypes_update_06Dec2016.txt"
phenotypes_update.df <- read.table(
  paste(source_data_folder, phenotypes_update_file, sep="/"), 
  header=TRUE, sep="\t", quote="")
BRCA1_BRCA2_PALB2_cases_file <- "cases_with_BRCA1_BRCA2_PALB2_pathogenic_variants.txt"
BRCA1_BRCA2_PALB2_cases.df <- read.table(
  paste(source_data_folder, BRCA1_BRCA2_PALB2_cases_file, sep="/"), 
  header=TRUE, sep="\t", quote="")
# Update rownames, when necessary
rownames(vv.df) <- vv.df[,1]
rownames(kgen.df) <- kgen.df[,1]
rownames(exac.df) <- exac.df[,1]
rownames(BRCA1_BRCA2_PALB2_cases.df) <- BRCA1_BRCA2_PALB2_cases.df[,1]
# Update colnames, when necessary
colnames(gt.df) <- sub(".GT", "", colnames(gt.df))
colnames(gq.df) <- sub(".GQ", "", colnames(gq.df))
colnames(dp.df) <- sub(".DP", "", colnames(dp.df))
# Clean-up
rm(source_data_folder, prefix, covar_file, demographics_file, samples_file, vv_file, gt_file, gq_file, dp_file, kgen_file, exac_file, phenotypes_update_file, BRCA1_BRCA2_PALB2_cases_file)

check_data

dim(gt.df)
[1] 343824    710
str(gt.df, list.len=5)
'data.frame':   343824 obs. of  710 variables:
 $ HG00097        : int  NA 0 0 0 0 0 0 0 0 0 ...
 $ HG00099        : int  0 0 0 0 NA NA NA NA 0 0 ...
 $ HG00100        : int  0 NA 0 0 0 0 0 0 0 0 ...
 $ HG00102        : int  NA NA 0 0 0 0 0 0 0 0 ...
 $ HG00106        : int  NA 0 0 0 0 0 0 0 0 0 ...
  [list output truncated]
gt.df[1:5,1:5]
dim(gq.df)
[1] 343824    710
str(gq.df, list.len=5)
'data.frame':   343824 obs. of  710 variables:
 $ HG00097        : int  NA 99 36 36 36 36 36 36 36 99 ...
 $ HG00099        : int  9 99 99 99 NA NA NA NA 99 48 ...
 $ HG00100        : int  3 NA 21 18 9 9 9 75 57 18 ...
 $ HG00102        : int  NA NA 12 3 99 93 93 72 72 72 ...
 $ HG00106        : int  NA 58 21 21 33 33 33 33 33 33 ...
  [list output truncated]
gq.df[1:5,1:5]
dim(dp.df)
[1] 343824    710
str(dp.df, list.len=5)
'data.frame':   343824 obs. of  710 variables:
 $ HG00097        : int  0 74 13 13 28 28 28 28 28 37 ...
 $ HG00099        : int  4 169 35 35 NA NA NA NA 40 18 ...
 $ HG00100        : int  1 0 7 7 10 10 10 33 22 8 ...
 $ HG00102        : int  0 0 4 1 37 32 32 27 27 24 ...
 $ HG00106        : int  NA 130 8 8 20 20 20 20 20 20 ...
  [list output truncated]
dp.df[1:5,1:5]
dim(covar.df)
[1] 498  34
str(covar.df)
'data.frame':   498 obs. of  34 variables:
 $ Subject_ID      : int  200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
 $ setno           : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
 $ cc              : int  0 0 0 0 0 0 1 1 0 0 ...
 $ chemo           : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone         : int  0 1 1 0 1 0 0 1 1 0 ...
 $ chemo_hormone   : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
 $ chemo_self_mra  : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone_self_mra: int  0 1 1 0 1 0 0 1 1 0 ...
 $ treatment       : int  1 1 1 0 1 1 1 1 1 1 ...
 $ ID              : int  2 6 7 9 11 12 16 22 24 26 ...
 $ labid           : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ status          : int  0 0 0 0 0 0 1 1 0 0 ...
 $ status2         : int  0 0 0 0 0 0 1 1 0 0 ...
 $ offset          : num  6.41 5.82 5.61 4.03 4.77 ...
 $ sub_dx_age      : int  46 50 46 44 43 39 45 40 43 36 ...
 $ XRTBreast       : int  0 0 0 1 0 1 0 0 1 0 ...
 $ Eigen_1         : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2         : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3         : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4         : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5         : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ dose            : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
 $ dsmiss          : int  0 0 0 0 1 0 0 0 0 0 ...
 $ good_location   : int  1 1 0 1 1 1 0 1 0 0 ...
 $ Deleterious     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ registry        : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
 $ race            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age_stratum1    : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
 $ dxyr_stratum    : int  2 2 3 1 1 2 1 2 3 2 ...
 $ CMF_Only        : int  1 0 0 0 0 1 0 0 1 1 ...
 $ family_history  : int  0 0 0 0 0 0 1 0 0 0 ...
 $ sub_dx_age_cat  : int  0 0 0 1 1 1 0 1 1 1 ...
 $ CMF             : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
 $ XRTBrCHAR       : int  0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
dim(samples.df)
[1] 512   4
str(samples.df)
'data.frame':   512 obs. of  4 variables:
 $ wes_id   : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ gwas_id  : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
 $ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ filter   : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
dim(demographics.df)
[1] 505  91
str(demographics.df)
'data.frame':   505 obs. of  91 variables:
 $ Subject_ID            : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ ID.x                  : num  2 6 7 9 11 12 NA NA 24 26 ...
 $ labid.x               : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
 $ Eigen_1.x             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2.x             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3.x             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4.x             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5.x             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ Phase                 : num  1 1 1 1 1 1 NA NA 1 1 ...
 $ setno.x               : num  382125 204356 360683 226881 357431 ...
 $ cc.x                  : int  0 0 0 0 0 0 NA NA 0 0 ...
 $ rstime                : num  7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
 $ registry.x            : Factor w/ 7 levels "","IA","IR","LA",..: 6 3 3 5 5 4 NA NA 2 6 ...
 $ race.x                : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ offset.x              : num  6.41 5.82 5.61 4.03 4.77 ...
 $ DOB                   : num  -5049 -7459 -3916 -5890 -6407 ...
 $ sub_dx_age.x          : num  46 50 46 44 43 39 NA NA 43 36 ...
 $ refage                : num  53 59 51 50 52 45 NA NA 48 44 ...
 $ BMI_age18             : num  20.2 19.7 23.3 19.5 25.8 ...
 $ BMI_dx                : num  22.8 20.9 23.3 32.6 25.8 ...
 $ BMI_ref               : num  25.7 22 25.8 32.6 28.1 ...
 $ hormone_self_mra.x    : num  0 1 1 0 1 0 NA NA 1 0 ...
 $ chemo_self_mra.x      : num  1 1 0 0 1 1 NA NA 1 1 ...
 $ treatment.x           : num  1 1 1 0 1 1 NA NA 1 1 ...
 $ family_history.x      : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
 $ rh_age_menarche       : num  13 14 13 13 12 9 NA NA 12 11 ...
 $ age_menopause_1yrbf_fd: num  -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
 $ age_1fftp_fd          : num  20 22 0 29 28 0 NA NA 27 24 ...
 $ Num_ftp_fd            : num  1 2 -1 2 2 -1 NA NA 3 2 ...
 $ Histology             : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
 $ Histology1            : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
 $ Hist_lob_other        : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
 $ stage_fd              : num  2 1 1 1 2 2 NA NA 2 2 ...
 $ er_fd                 : num  1 1 1 4 1 2 NA NA 2 2 ...
 $ pr_fd                 : num  1 1 1 2 1 2 NA NA 1 0 ...
 $ histo1_cat            : Factor w/ 9 levels "Tubular/mucinous",..: 9 4 4 6 4 4 NA NA 4 4 ...
 $ er1_cat               : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
 $ pr1_cat               : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
 $ status.x              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ status2.x             : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
 $ sub_race              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ er1_num               : num  1 1 1 NA 1 0 NA NA 0 0 ...
 $ er1                   : int  1 1 1 NA 1 0 NA NA 0 0 ...
 $ horm_tmx              : num  0 1 1 0 2 0 NA NA 1 0 ...
 $ XRTBreast.x           : num  0 0 0 1 0 1 NA NA 1 0 ...
 $ dose_caseloc          : num  0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
 $ good_location.x       : num  1 1 0 1 1 1 NA NA 0 0 ...
 $ avedose               : num  0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
 $ tmx                   : num  0 1 1 0 0 0 NA NA 1 0 ...
 $ num_preg              : num  1 1 0 1 1 0 NA NA 1 1 ...
 $ fam_hist              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ age_menarche          : int  1 1 1 1 0 0 NA NA 0 0 ...
 $ lobular               : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ age_menopause         : int  0 2 0 0 0 0 NA NA 0 0 ...
 $ conf_miss             : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ dose_num              : num  0 0 0 1 NA 1 NA NA 1 0 ...
 $ cmf                   : Factor w/ 6 levels "","\024\x9c@",..: 3 4 5 5 3 3 NA NA 3 3 ...
 $ cmf_012               : num  1 2 0 0 1 1 NA NA 1 1 ...
 $ setno.y               : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
 $ cc.y                  : int  0 0 0 0 0 0 1 1 0 0 ...
 $ chemo                 : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone               : int  0 1 1 0 1 0 0 1 1 0 ...
 $ chemo_hormone         : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
 $ chemo_self_mra.y      : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone_self_mra.y    : int  0 1 1 0 1 0 0 1 1 0 ...
 $ treatment.y           : int  1 1 1 0 1 1 1 1 1 1 ...
 $ ID.y                  : int  2 6 7 9 11 12 16 22 24 26 ...
 $ labid.y               : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ status.y              : int  0 0 0 0 0 0 1 1 0 0 ...
 $ status2.y             : int  0 0 0 0 0 0 1 1 0 0 ...
 $ offset.y              : num  6.41 5.82 5.61 4.03 4.77 ...
 $ sub_dx_age.y          : int  46 50 46 44 43 39 45 40 43 36 ...
 $ XRTBreast.y           : int  0 0 0 1 0 1 0 0 1 0 ...
 $ Eigen_1.y             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2.y             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3.y             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4.y             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5.y             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ dose                  : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
 $ dsmiss                : int  0 0 0 0 1 0 0 0 0 0 ...
 $ good_location.y       : int  1 1 0 1 1 1 0 1 0 0 ...
 $ Deleterious           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ registry.y            : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
 $ race.y                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age_stratum1          : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
 $ dxyr_stratum          : int  2 2 3 1 1 2 1 2 3 2 ...
 $ CMF_Only              : int  1 0 0 0 0 1 0 0 1 1 ...
 $ family_history.y      : int  0 0 0 0 0 0 1 0 0 0 ...
 $ sub_dx_age_cat        : int  0 0 0 1 1 1 0 1 1 1 ...
 $ CMF                   : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
 $ XRTBrCHAR             : int  0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
dim(phenotypes_update.df)
[1] 13 22
str(phenotypes_update.df)
'data.frame':   13 obs. of  22 variables:
 $ gwas_id       : Factor w/ 13 levels "id201558","id201921",..: 11 8 7 9 3 6 10 5 4 13 ...
 $ wes_id        : Factor w/ 13 levels "P1_C02","P1_C04",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ cc            : int  0 0 0 0 1 0 1 1 0 0 ...
 $ setno         : int  227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
 $ family_history: int  1 1 1 0 1 0 1 1 0 0 ...
 $ age_dx        : int  30 30 31 30 24 23 31 33 49 40 ...
 $ age_ref       : int  39 33 32 34 28 27 32 37 54 47 ...
 $ rstime        : num  9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
 $ Eigen_1       : num  -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
 $ Eigen_2       : num  0.00615 0.00853 0.00138 0.00428 0.00317 ...
 $ Eigen_3       : num  -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
 $ Eigen_4       : num  -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
 $ Eigen_5       : num  -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
 $ stage         : int  1 1 2 2 2 1 2 1 1 1 ...
 $ er1           : Factor w/ 3 levels "0","1","missing": 1 2 1 2 1 2 2 3 2 3 ...
 $ pr1           : Factor w/ 3 levels "0","1","missing": 1 2 3 2 1 2 1 3 2 3 ...
 $ hist_cat      : Factor w/ 3 levels "ductal          ",..: 3 1 1 1 1 3 2 1 1 1 ...
 $ hormone       : Factor w/ 3 levels "0","1","missing": 1 1 1 1 1 1 1 1 3 1 ...
 $ chemo_cat     : Factor w/ 3 levels "CMF","Oth","no ": 1 3 2 3 2 2 1 2 2 2 ...
 $ br_xray       : int  1 0 0 0 0 1 0 1 0 1 ...
 $ br_xray_dose  : num  1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
 $ num_preg      : int  2 0 1 1 1 0 0 1 1 0 ...
phenotypes_update.df[1:5,1:5]
dim(BRCA1_BRCA2_PALB2_cases.df)
[1] 11 12
str(BRCA1_BRCA2_PALB2_cases.df)
'data.frame':   11 obs. of  12 variables:
 $ Cases_wes_id : Factor w/ 11 levels "P1_A09","P1_C02",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Cases_gwas_id: Factor w/ 11 levels "id204830","id247333",..: 8 9 3 1 6 2 4 11 5 7 ...
 $ SYMBOL       : Factor w/ 3 levels "BRCA1","BRCA2",..: 1 3 1 1 2 1 2 2 3 3 ...
 $ TYPE         : Factor w/ 2 levels "INDEL","SNP": 2 1 2 2 2 1 1 1 2 2 ...
 $ CHROM        : int  17 16 17 17 13 17 13 13 16 16 ...
 $ POS.GRCh37   : int  41256985 23641422 41243800 41258504 32936732 41276044 32914437 32914437 23625324 23634452 ...
 $ REF          : Factor w/ 6 levels "A","C","G","GT",..: 5 6 2 1 3 1 4 4 2 2 ...
 $ ALT          : Factor w/ 5 levels "A","ACT","C",..: 3 5 1 3 3 2 4 4 4 4 ...
 $ rs_id        : Factor w/ 9 levels "rs28897672","rs28897686",..: 7 5 2 1 8 6 9 9 4 3 ...
 $ Consequence  : Factor w/ 6 levels "frameshift_variant",..: 2 1 6 3 3 1 1 1 5 4 ...
 $ wes_var_id   : Factor w/ 9 levels "Var000223294",..: 7 5 6 8 2 9 1 1 3 4 ...
 $ Note         : Factor w/ 2 levels "","Added for QC": 1 1 2 2 1 2 1 1 1 1 ...
BRCA1_BRCA2_PALB2_cases.df[1:5,1:5]
dim(vv.df)
[1] 343824     35
str(vv.df)
'data.frame':   343824 obs. of  35 variables:
 $ SplitVarID         : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ TYPE               : Factor w/ 2 levels "INDEL","SNP": 1 2 2 2 2 2 2 2 2 2 ...
 $ ID                 : Factor w/ 253381 levels "rs10000804","rs10000924",..: NA 234676 201998 253336 54527 NA NA 162750 NA NA ...
 $ CHROM              : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ POS                : int  664486 762330 865628 865694 871159 871171 871173 871215 871230 871271 ...
 $ REF                : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1653 1044 1044 539 1044 1 539 539 539 539 ...
 $ ALT                : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 989 1 989 1 664 1 664 989 989 ...
 $ QUAL               : num  35191 7081 6420 991 649 ...
 $ DP                 : int  49355 26720 14628 8357 11648 12903 13516 16282 15415 10748 ...
 $ AS_VQSLOD          : num  1.57 7.04 16.2 18.18 16.05 ...
 $ FILTER             : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
 $ AC                 : int  53 8 7 4 1 1 1 5 1 1 ...
 $ AF                 : num  0.044 0.00622 0.00519 0.00318 0.00082 ...
 $ AN                 : int  1206 1286 1348 1258 1220 1250 1258 1312 1310 1294 ...
 $ NEGATIVE_TRAIN_SITE: logi  NA NA NA NA NA NA ...
 $ POSITIVE_TRAIN_SITE: logi  NA NA NA TRUE NA NA ...
 $ SYMBOL             : Factor w/ 19874 levels "A1BG","A1CF",..: 14360 9127 14922 14922 14922 14922 14922 14922 14922 14922 ...
 $ Allele             : Factor w/ 1048 levels "-","A","AA","AAA",..: 1 790 2 790 2 533 2 533 790 790 ...
 $ Existing_variation : Factor w/ 307403 levels "","1KG_2_227731951",..: 1 255726 203059 307355 55938 218187 1 163633 277286 1 ...
 $ Consequence        : Factor w/ 80 levels "3_prime_UTR_variant",..: 80 32 28 28 28 28 78 78 78 28 ...
 $ IMPACT             : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 4 4 3 3 3 3 2 2 2 3 ...
 $ CLIN_SIG           : Factor w/ 93 levels "association",..: NA NA NA NA NA NA NA NA NA NA ...
 $ cDNA_position      : Factor w/ 16588 levels "0-1","1","1-18",..: NA 11562 6108 7438 8899 9105 9146 9809 10047 10653 ...
 $ CDS_position       : Factor w/ 15333 levels "1","1-18","1-2",..: NA NA 3598 5147 6675 6879 6912 7603 7853 8506 ...
 $ Codons             : Factor w/ 3014 levels "-/A","-/AA","-/AAAA",..: NA NA 698 472 690 319 1167 1753 1848 1532 ...
 $ Protein_position   : Factor w/ 7359 levels "1","1-2","1-6",..: NA NA 5937 6741 125 216 216 514 620 910 ...
 $ Amino_acids        : Factor w/ 1851 levels "*","*/*EX","*/C",..: NA NA 589 695 589 1586 1583 1100 1329 1107 ...
 $ DISTANCE           : int  960 NA NA NA NA NA NA NA NA NA ...
 $ STRAND             : int  -1 -1 1 1 1 1 1 1 1 1 ...
 $ SYMBOL_SOURCE      : Factor w/ 6 levels "Clone_based_ensembl_gene",..: 2 3 3 3 3 3 3 3 3 3 ...
 $ SIFT_call          : Factor w/ 4 levels "deleterious",..: NA NA 2 1 3 1 NA NA NA 4 ...
 $ SIFT_score         : num  NA NA 0.01 0.04 1 0.05 NA NA NA 0.09 ...
 $ PolyPhen_call      : Factor w/ 4 levels "benign","possibly_damaging",..: NA NA 1 2 1 1 NA NA NA 1 ...
 $ PolyPhen_score     : num  NA NA 0.099 0.493 0.002 0.018 NA NA NA 0.045 ...
 $ Multiallelic       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
vv.df[1:5,1:5]
dim(kgen.df)
[1] 343824      9
str(kgen.df)
'data.frame':   343824 obs. of  9 variables:
 $ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ kgen.AC    : int  NA NA 14 263 2 1 NA NA NA NA ...
 $ kgen.AN    : int  NA NA 5008 5008 5008 5008 NA NA NA NA ...
 $ kgen.AF    : num  NA NA 0.002796 0.052516 0.000399 ...
 $ kgen.AFR_AF: num  NA NA 0 0.0227 0 0 NA NA NA NA ...
 $ kgen.AMR_AF: num  NA NA 0.0072 0.0965 0 0 NA NA NA NA ...
 $ kgen.EAS_AF: num  NA NA 0 0.139 0 ...
 $ kgen.EUR_AF: num  NA NA 0.005 0.002 0.002 0.001 NA NA NA NA ...
 $ kgen.SAS_AF: num  NA NA 0.0041 0.0245 0 0 NA NA NA NA ...
kgen.df[1:5,1:5]
dim(exac.df)
[1] 343824     48
str(exac.df)
'data.frame':   343824 obs. of  48 variables:
 $ SplitVarID             : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ exac_non_TCGA.AF       : num  NA 0.012 0.00278 0.025 0.00231 ...
 $ exac_non_TCGA.AC       : int  NA 175 295 2670 245 NA NA NA NA NA ...
 $ exac_non_TCGA.AN       : int  NA 14012 106020 106038 106210 NA NA NA NA NA ...
 $ exac_non_TCGA.AC_FEMALE: int  NA 59 94 1232 97 NA NA 1553 NA NA ...
 $ exac_non_TCGA.AN_FEMALE: int  NA 3452 18598 24670 45846 NA NA 45840 NA NA ...
 $ exac_non_TCGA.AC_MALE  : int  NA 91 170 1159 148 NA NA 1483 NA NA ...
 $ exac_non_TCGA.AN_MALE  : int  NA 8250 26380 33440 60312 NA NA 60304 NA NA ...
 $ exac_non_TCGA.AC_Adj   : int  NA 150 264 2391 245 NA NA 3036 NA NA ...
 $ exac_non_TCGA.AN_Adj   : int  NA 11702 44978 58110 106158 NA NA 106144 NA NA ...
 $ exac_non_TCGA.AC_Hom   : int  NA 0 1 102 2 NA NA 136 NA NA ...
 $ exac_non_TCGA.AC_Het   : int  NA 150 262 2187 241 NA NA 2760 NA NA ...
 $ exac_non_TCGA.AC_Hemi  : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_AFR   : int  NA 85 6 172 0 NA NA 263 NA NA ...
 $ exac_non_TCGA.AN_AFR   : int  NA 422 4194 5062 9054 NA NA 9040 NA NA ...
 $ exac_non_TCGA.Hom_AFR  : int  NA 0 0 2 0 NA NA 6 NA NA ...
 $ exac_non_TCGA.Het_AFR  : int  NA 85 6 168 0 NA NA 251 NA NA ...
 $ exac_non_TCGA.Hemi_AFR : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_AMR   : int  NA 2 20 979 0 NA NA 1180 NA NA ...
 $ exac_non_TCGA.AN_AMR   : int  NA 224 3570 6030 11216 NA NA 11210 NA NA ...
 $ exac_non_TCGA.Hom_AMR  : int  NA 0 0 46 0 NA NA 67 NA NA ...
 $ exac_non_TCGA.Het_AMR  : int  NA 2 20 887 0 NA NA 1046 NA NA ...
 $ exac_non_TCGA.Hemi_AMR : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_EAS   : int  NA 0 0 792 0 NA NA 889 NA NA ...
 $ exac_non_TCGA.AN_EAS   : int  NA 314 3432 4930 7866 NA NA 7854 NA NA ...
 $ exac_non_TCGA.Hom_EAS  : int  NA 0 0 47 0 NA NA 52 NA NA ...
 $ exac_non_TCGA.Het_EAS  : int  NA 0 0 698 0 NA NA 785 NA NA ...
 $ exac_non_TCGA.Hemi_EAS : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_FIN   : int  NA 0 9 4 149 NA NA 39 NA NA ...
 $ exac_non_TCGA.AN_FIN   : int  NA 30 1376 1960 6614 NA NA 6614 NA NA ...
 $ exac_non_TCGA.Hom_FIN  : int  NA 0 0 0 2 NA NA 0 NA NA ...
 $ exac_non_TCGA.Het_FIN  : int  NA 0 9 4 145 NA NA 39 NA NA ...
 $ exac_non_TCGA.Hemi_FIN : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_NFE   : int  NA 50 193 61 92 NA NA 172 NA NA ...
 $ exac_non_TCGA.AN_NFE   : int  NA 2944 22156 28886 54312 NA NA 54328 NA NA ...
 $ exac_non_TCGA.Hom_NFE  : int  NA 0 1 1 0 NA NA 2 NA NA ...
 $ exac_non_TCGA.Het_NFE  : int  NA 50 191 59 92 NA NA 168 NA NA ...
 $ exac_non_TCGA.Hemi_NFE : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_SAS   : int  NA 10 33 373 1 NA NA 478 NA NA ...
 $ exac_non_TCGA.AN_SAS   : int  NA 7650 9972 10884 16402 NA NA 16404 NA NA ...
 $ exac_non_TCGA.Hom_SAS  : int  NA 0 0 6 0 NA NA 9 NA NA ...
 $ exac_non_TCGA.Het_SAS  : int  NA 10 33 361 1 NA NA 456 NA NA ...
 $ exac_non_TCGA.Hemi_SAS : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_OTH   : int  NA 3 3 10 3 NA NA 15 NA NA ...
 $ exac_non_TCGA.AN_OTH   : int  NA 118 278 358 694 NA NA 694 NA NA ...
 $ exac_non_TCGA.Hom_OTH  : int  NA 0 0 0 0 NA NA 0 NA NA ...
 $ exac_non_TCGA.Het_OTH  : int  NA 3 3 10 3 NA NA 15 NA NA ...
 $ exac_non_TCGA.Hemi_OTH : int  NA NA NA NA NA NA NA NA NA NA ...
exac.df[1:5,1:5]

convert_data_frames_to_matrices

gt.mx <- as.matrix(gt.df)
gq.mx <- as.matrix(gq.df)
dp.mx <- as.matrix(dp.df)
dim(gt.mx)
[1] 343824    710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001      NA       0       0      NA      NA
Var000000002       0       0      NA      NA       0
Var000000003       0       0       0       0       0
Var000000004       0       0       0       0       0
Var000000005       0      NA       0       0       0
dim(gq.mx)
[1] 343824    710
class(gq.mx)
[1] "matrix"
gq.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001      NA       9       3      NA      NA
Var000000002      99      99      NA      NA      58
Var000000003      36      99      21      12      21
Var000000004      36      99      18       3      21
Var000000005      36      NA       9      99      33
dim(dp.mx)
[1] 343824    710
class(dp.mx)
[1] "matrix"
dp.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001       0       4       1       0      NA
Var000000002      74     169       0       0     130
Var000000003      13      35       7       4       8
Var000000004      13      35       7       1       8
Var000000005      28      NA      10      37      20
rm(gt.df, gq.df, dp.df)

check_consistence_of_rownames_and_colnames

# rownames
sum(rownames(gt.mx) != rownames(gq.mx))
[1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
[1] 0
sum(rownames(gt.mx) != rownames(vv.df))
[1] 0
sum(rownames(gt.mx) != rownames(kgen.df))
[1] 0
sum(rownames(gt.mx) != rownames(exac.df))
[1] 0
# colnames
sum(colnames(gt.mx) != colnames(gq.mx))
[1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
[1] 0

convert_blanks_to_NAs

NA -> vv.df[vv.df$Existing_variation == "", "Existing_variation"] # no blanks in other fields
NA -> covar.df[covar.df == ""] # 1 case in chemo_hormone (row 84)
NA -> demographics.df[demographics.df == ""] # 2 cases in registry and one in chemo_hormone
NA -> BRCA1_BRCA2_PALB2_cases.df[BRCA1_BRCA2_PALB2_cases.df == ""] # 8 cases in notes
# No blanks in other tables

reformat_phenotypes_update

# Converst factors to vectors (to symplify comparisons)
str(phenotypes_update.df)
'data.frame':   13 obs. of  22 variables:
 $ gwas_id       : Factor w/ 13 levels "id201558","id201921",..: 11 8 7 9 3 6 10 5 4 13 ...
 $ wes_id        : Factor w/ 13 levels "P1_C02","P1_C04",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ cc            : int  0 0 0 0 1 0 1 1 0 0 ...
 $ setno         : int  227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
 $ family_history: int  1 1 1 0 1 0 1 1 0 0 ...
 $ age_dx        : int  30 30 31 30 24 23 31 33 49 40 ...
 $ age_ref       : int  39 33 32 34 28 27 32 37 54 47 ...
 $ rstime        : num  9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
 $ Eigen_1       : num  -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
 $ Eigen_2       : num  0.00615 0.00853 0.00138 0.00428 0.00317 ...
 $ Eigen_3       : num  -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
 $ Eigen_4       : num  -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
 $ Eigen_5       : num  -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
 $ stage         : int  1 1 2 2 2 1 2 1 1 1 ...
 $ er1           : Factor w/ 3 levels "0","1","missing": 1 2 1 2 1 2 2 3 2 3 ...
 $ pr1           : Factor w/ 3 levels "0","1","missing": 1 2 3 2 1 2 1 3 2 3 ...
 $ hist_cat      : Factor w/ 3 levels "ductal          ",..: 3 1 1 1 1 3 2 1 1 1 ...
 $ hormone       : Factor w/ 3 levels "0","1","missing": 1 1 1 1 1 1 1 1 3 1 ...
 $ chemo_cat     : Factor w/ 3 levels "CMF","Oth","no ": 1 3 2 3 2 2 1 2 2 2 ...
 $ br_xray       : int  1 0 0 0 0 1 0 1 0 1 ...
 $ br_xray_dose  : num  1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
 $ num_preg      : int  2 0 1 1 1 0 0 1 1 0 ...
as.character(phenotypes_update.df$gwas_id) -> phenotypes_update.df$gwas_id
as.character(phenotypes_update.df$wes_id) -> phenotypes_update.df$wes_id
trimws(as.character(phenotypes_update.df$hist_cat)) -> phenotypes_update.df$hist_cat
trimws(as.character(phenotypes_update.df$chemo_cat)) -> phenotypes_update.df$chemo_cat
as.character(phenotypes_update.df$er1) -> phenotypes_update.df$er1
as.character(phenotypes_update.df$pr1) -> phenotypes_update.df$pr1
as.character(phenotypes_update.df$hormone) -> phenotypes_update.df$hormone
NA -> phenotypes_update.df[phenotypes_update.df$er1 == "missing", "er1"]
NA -> phenotypes_update.df[phenotypes_update.df$pr1 == "missing", "pr1"]
NA -> phenotypes_update.df[phenotypes_update.df$hormone == "missing", "hormone"]
as.numeric(phenotypes_update.df$er1) -> phenotypes_update.df$er1
as.numeric(phenotypes_update.df$pr1) -> phenotypes_update.df$pr1
as.numeric(phenotypes_update.df$hormone) -> phenotypes_update.df$hormone
str(phenotypes_update.df)
'data.frame':   13 obs. of  22 variables:
 $ gwas_id       : chr  "id319270" "id298378" "id271981" "id301570" ...
 $ wes_id        : chr  "P1_C02" "P1_C04" "P1_C06" "P1_C11" ...
 $ cc            : int  0 0 0 0 1 0 1 1 0 0 ...
 $ setno         : int  227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
 $ family_history: int  1 1 1 0 1 0 1 1 0 0 ...
 $ age_dx        : int  30 30 31 30 24 23 31 33 49 40 ...
 $ age_ref       : int  39 33 32 34 28 27 32 37 54 47 ...
 $ rstime        : num  9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
 $ Eigen_1       : num  -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
 $ Eigen_2       : num  0.00615 0.00853 0.00138 0.00428 0.00317 ...
 $ Eigen_3       : num  -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
 $ Eigen_4       : num  -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
 $ Eigen_5       : num  -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
 $ stage         : int  1 1 2 2 2 1 2 1 1 1 ...
 $ er1           : num  0 1 0 1 0 1 1 NA 1 NA ...
 $ pr1           : num  0 1 NA 1 0 1 0 NA 1 NA ...
 $ hist_cat      : chr  "other carcinoma" "ductal" "ductal" "ductal" ...
 $ hormone       : num  0 0 0 0 0 0 0 0 NA 0 ...
 $ chemo_cat     : chr  "CMF" "no" "Oth" "no" ...
 $ br_xray       : int  1 0 0 0 0 1 0 1 0 1 ...
 $ br_xray_dose  : num  1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
 $ num_preg      : int  2 0 1 1 1 0 0 1 1 0 ...

save_data

save.image(paste(interim_data_folder, "r01_read_and_clean_data_jan2017.RData", sep="/"))

final_section

ls()
 [1] "BRCA1_BRCA2_PALB2_cases.df" "covar.df"                   "demographics.df"            "dp.mx"                     
 [5] "exac.df"                    "gq.mx"                      "gt.mx"                      "interim_data_folder"       
 [9] "kgen.df"                    "phenotypes_update.df"       "samples.df"                 "vv.df"                     
sessionInfo()
R version 3.2.3 (2015-12-10)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Scientific Linux 7.2 (Nitrogen)

locale:
 [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C         LC_TIME=C            LC_COLLATE=C         LC_MONETARY=C       
 [6] LC_MESSAGES=C        LC_PAPER=C           LC_NAME=C            LC_ADDRESS=C         LC_TELEPHONE=C      
[11] LC_MEASUREMENT=C     LC_IDENTIFICATION=C 

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] backports_1.0.4 magrittr_1.5    rprojroot_1.1   htmltools_0.3.5 tools_3.2.3     base64enc_0.1-3 yaml_2.1.14    
 [8] Rcpp_0.12.8     stringi_1.1.2   rmarkdown_1.3   knitr_1.15.1    jsonlite_1.2    stringr_1.1.0   digest_0.6.11  
[15] evaluate_0.10  
Sys.time()
[1] "2017-01-19 20:51:50 GMT"
LS0tCnRpdGxlOiAicmVhZF9kYXRhX3dlY2FyZV9qYW4yMDE3IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpzdGFydGVkOiAyMDE2ICAKbGFzdCB1cGRhdGVkOiAxOUphbjIwMTcKCiMgTm90ZXMKClNvdXJjZSBkYXRhIGluY2x1ZGVzICAKLSB0aGUgb3V0cHV0IGdlbmVyYXRlZCBieSB3ZXMgcGlwZWxpbmUgaW4gbm92MjAxNiAgCi0gdGFibGUgZm9yIHNhbXBsZXMgaW5mbywgaW5jbHVkaW5nIHdlcy1nd2FzIGNvcnJlc3BvbmRlbmNlIGFuZCBwYXNzL2ZhaWwgaW5mbyBhYm91dCBzYW1wbGVzIGluIHdlcyAgCi0gc2V2ZXJhbCB0YWJsZXMgd2l0aCBwaGVub3R5cGUgZGF0YSwgb2J0YWluZWQgZnJvbSBEQyBhdCBkaWZmZXJlbnQgdGltZXMgIAotIHRhYmxlIHdpdGggY2FzZXMgd2hlcmUgcGF0aG9nZW5pYyBCUkNBMSwgQlJDQTIgb3IgUEFMQjIgdmFyaWFudHMgd2VyZSBkZXRlY3RlZAoKSW1wb3J0YW50bHksIGV4YWMgYW5kIGtnZW4gZGF0YSBtYXkgbm90IGJlIGNvcnJlY3QgKGVzcGVjaWFsbHkgZm9yIHRoZSByZXRhaW5lZCBtdWx0aWFsbGVsaWMgc2l0ZXMpIApiZWNhdXNlIEdBVEsgdmFyaWFudC10by10YWJsZSB0b29sIGRpZCBub3QgaGFuZGxlIG11bHRpYWxsZWxpYyBzaXRlcyBjb3JyZWN0bHkgYXQgdGhlIHRpbWUgb2YgZGF0YSBnZW5lcmF0aW9uLiAKCiMgc3RhcnRfc2VjdGlvbgoKYGBge3Igc3RhcnRfc2VjdGlvbn0KCiMgVGltZSBzdGFtcApTeXMudGltZSgpCgojIEZvbGRlcnMKc2V0d2QoIi9zY3JhdGNoL21lZGdlbi9zY3JpcHRzL3dlY2FyZV9zdGF0XzAxLjE3L3NjcmlwdHMiKQpzb3VyY2VfZGF0YV9mb2xkZXIgPC0gIi9zY3JhdGNoL21lZGdlbi9zY3JpcHRzL3dlY2FyZV9zdGF0XzAxLjE3L3NvdXJjZV9kYXRhIgppbnRlcmltX2RhdGFfZm9sZGVyIDwtICIvc2NyYXRjaC9tZWRnZW4vc2NyaXB0cy93ZWNhcmVfc3RhdF8wMS4xNy9pbnRlcmltX2RhdGEiCgpgYGAKCiMgY29weV9zb3VyY2VfbmdzX2RhdGEKCkRvbmUgb25seSBvbmNlLiBIZW5jZSBGQUxTRSBpbiBjb3B5aW5nLgoKYGBge3IgY29weV9zb3VyY2VfbmdzX2RhdGF9CgpzcmNfZm9sZGVyIDwtICIvc2NyYXRjaC9tZWRnZW4vdXNlcnMvYWxleGV5L3dlY2FyZV8zX25vdjIwMTYvd2VjYXJlX25mZV9ub3YyMDE2X3Zxc3Jfc2hmX3NtYV9hbm5fdHh0Igp0Z3RfZm9sZGVyIDwtIHNvdXJjZV9kYXRhX2ZvbGRlcgoKcHJlZml4PSJ3ZWNhcmVfbmZlX25vdjIwMTZfdnFzcl9zaGZfc21hX2FubiIKCmd0X2ZpbGUgPC0gcGFzdGUocHJlZml4LCJHVF9hZGQudHh0IixzZXA9Il8iKQpncV9maWxlIDwtIHBhc3RlKHByZWZpeCwiR1EudHh0IixzZXA9Il8iKQpkcF9maWxlIDwtIHBhc3RlKHByZWZpeCwiRFAudHh0IixzZXA9Il8iKQp2dl9maWxlIDwtIHBhc3RlKHByZWZpeCwiVlYudHh0IixzZXA9Il8iKQpleGFjX2ZpbGUgPC0gcGFzdGUocHJlZml4LCJleGFjLnR4dCIsc2VwPSJfIikKa2dlbl9maWxlIDwtIHBhc3RlKHByZWZpeCwia2dlbi50eHQiLHNlcD0iXyIpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgZ3RfZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwgZ3RfZmlsZSwgc2VwPSIvIikpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgZ3FfZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwgZ3FfZmlsZSwgc2VwPSIvIikpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgZHBfZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwgZHBfZmlsZSwgc2VwPSIvIikpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgdnZfZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwgdnZfZmlsZSwgc2VwPSIvIikpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgZXhhY19maWxlLCBzZXA9Ii8iKSwKICBwYXN0ZSh0Z3RfZm9sZGVyLCBleGFjX2ZpbGUsIHNlcD0iLyIpKQoKZmlsZS5jb3B5KAogIHBhc3RlKHNyY19mb2xkZXIsIGtnZW5fZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwga2dlbl9maWxlLCBzZXA9Ii8iKSkKCiMgQ2xlYW4tdXAKcm0oc3JjX2ZvbGRlciwgdGd0X2ZvbGRlcikKCmBgYAoKIyBjb3B5X3NvdXJjZV9waGVub3R5cGVfZGF0YQoKRG9uZSBvbmx5IG9uY2UuIEhlbmNlIEZBTFNFIGluIGNvcHlpbmcuCgpGaWxlcyB3aXRoIHBoZW5vdHlwZXMgdXBkYXRlcyAoRGVjMjAxNikgYW5kIGNhc2VzIHdpdGggQlJDQTEsIEJSQ0EyIGFuZCBQQUxCMiAoRGVjIDIwMTYpIHdlcmUgYWRkZWQgbWFudWFsbHkuCgpgYGB7ciBjb3B5X3NvdXJjZV9waGVub3R5cGVfZGF0YX0KCnNyY19mb2xkZXIgPC0gIi9zY3JhdGNoL21lZGdlbi91c2Vycy9hbGV4ZXkvd2VjYXJlX3N0YXRfMl9hdWcyMDE2L3NvdXJjZV9kYXRhIgp0Z3RfZm9sZGVyIDwtIHNvdXJjZV9kYXRhX2ZvbGRlcgoKY292YXJfZmlsZSA8LSAiY292YXIudHh0IgpzYW1wbGVzX2ZpbGUgPC0gInNhbXBsZXNfaWRzLnR4dCIKZGVtb2dyYXBoaWNzX2ZpbGUgPC0gIldFQ0FSRS5FeG9tZS5EZW1vZ3JhcGhpY1ZhcmlhYmxlcy50eHQiCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgY292YXJfZmlsZSwgc2VwPSIvIiksCiAgcGFzdGUodGd0X2ZvbGRlciwgY292YXJfZmlsZSwgc2VwPSIvIikpCgpmaWxlLmNvcHkoCiAgcGFzdGUoc3JjX2ZvbGRlciwgc2FtcGxlc19maWxlLCBzZXA9Ii8iKSwKICBwYXN0ZSh0Z3RfZm9sZGVyLCBzYW1wbGVzX2ZpbGUsIHNlcD0iLyIpKQoKZmlsZS5jb3B5KAogIHBhc3RlKHNyY19mb2xkZXIsIGRlbW9ncmFwaGljc19maWxlLCBzZXA9Ii8iKSwKICBwYXN0ZSh0Z3RfZm9sZGVyLCBkZW1vZ3JhcGhpY3NfZmlsZSwgc2VwPSIvIikpCgojIENsZWFuLXVwCnJtKHNyY19mb2xkZXIsIHRndF9mb2xkZXIpCgpgYGAKCiMgcmVhZF9kYXRhCgpgYGB7ciByZWFkX2RhdGF9CgpndC5kZiA8LSByZWFkLnRhYmxlKAogIHBhc3RlKHNvdXJjZV9kYXRhX2ZvbGRlciwgZ3RfZmlsZSwgc2VwPSIvIiksIAogIGhlYWRlcj1UUlVFLCByb3cubmFtZXM9MSwgc2VwPSJcdCIsIHF1b3RlPSIiKQoKZ3EuZGYgPC0gcmVhZC50YWJsZSgKICBwYXN0ZShzb3VyY2VfZGF0YV9mb2xkZXIsIGdxX2ZpbGUsIHNlcD0iLyIpLCAKICBoZWFkZXI9VFJVRSwgcm93Lm5hbWVzPTEsIHNlcD0iXHQiLCBxdW90ZT0iIikKCmRwLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBkcF9maWxlLCBzZXA9Ii8iKSwgCiAgaGVhZGVyPVRSVUUsIHJvdy5uYW1lcz0xLCBzZXA9Ilx0IiwgcXVvdGU9IiIpCgpjb3Zhci5kZiA8LSByZWFkLnRhYmxlKAogIHBhc3RlKHNvdXJjZV9kYXRhX2ZvbGRlciwgY292YXJfZmlsZSwgc2VwPSIvIiksIAogIHNlcD0iXHQiLCBoZWFkZXI9VFJVRSwgcXVvdGU9IiIpCgpzYW1wbGVzLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBzYW1wbGVzX2ZpbGUsIHNlcD0iLyIpLCAKICBzZXA9Ilx0IiwgaGVhZGVyPVRSVUUsIHF1b3RlPSIiKQoKZGVtb2dyYXBoaWNzLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBkZW1vZ3JhcGhpY3NfZmlsZSwgc2VwPSIvIiksIAogIHNlcD0iXHQiLCBoZWFkZXI9VFJVRSwgcXVvdGU9IiIpCgp2di5kZiA8LSByZWFkLnRhYmxlKAogIHBhc3RlKHNvdXJjZV9kYXRhX2ZvbGRlciwgdnZfZmlsZSwgc2VwPSIvIiksIAogIGhlYWRlcj1UUlVFLCBzZXA9Ilx0IiwgcXVvdGU9IiIpCgprZ2VuLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBrZ2VuX2ZpbGUsIHNlcD0iLyIpLCAKICBoZWFkZXI9VFJVRSwgc2VwPSJcdCIsIHF1b3RlPSIiKQoKZXhhYy5kZiA8LSByZWFkLnRhYmxlKAogIHBhc3RlKHNvdXJjZV9kYXRhX2ZvbGRlciwgZXhhY19maWxlLCBzZXA9Ii8iKSwgCiAgaGVhZGVyPVRSVUUsIHNlcD0iXHQiLCBxdW90ZT0iIikKCnBoZW5vdHlwZXNfdXBkYXRlX2ZpbGUgPC0gInBoZW5vdHlwZXNfdXBkYXRlXzA2RGVjMjAxNi50eHQiCnBoZW5vdHlwZXNfdXBkYXRlLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBwaGVub3R5cGVzX3VwZGF0ZV9maWxlLCBzZXA9Ii8iKSwgCiAgaGVhZGVyPVRSVUUsIHNlcD0iXHQiLCBxdW90ZT0iIikKCkJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzX2ZpbGUgPC0gImNhc2VzX3dpdGhfQlJDQTFfQlJDQTJfUEFMQjJfcGF0aG9nZW5pY192YXJpYW50cy50eHQiCkJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmIDwtIHJlYWQudGFibGUoCiAgcGFzdGUoc291cmNlX2RhdGFfZm9sZGVyLCBCUkNBMV9CUkNBMl9QQUxCMl9jYXNlc19maWxlLCBzZXA9Ii8iKSwgCiAgaGVhZGVyPVRSVUUsIHNlcD0iXHQiLCBxdW90ZT0iIikKCiMgVXBkYXRlIHJvd25hbWVzLCB3aGVuIG5lY2Vzc2FyeQpyb3duYW1lcyh2di5kZikgPC0gdnYuZGZbLDFdCnJvd25hbWVzKGtnZW4uZGYpIDwtIGtnZW4uZGZbLDFdCnJvd25hbWVzKGV4YWMuZGYpIDwtIGV4YWMuZGZbLDFdCnJvd25hbWVzKEJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmKSA8LSBCUkNBMV9CUkNBMl9QQUxCMl9jYXNlcy5kZlssMV0KCiMgVXBkYXRlIGNvbG5hbWVzLCB3aGVuIG5lY2Vzc2FyeQpjb2xuYW1lcyhndC5kZikgPC0gc3ViKCIuR1QiLCAiIiwgY29sbmFtZXMoZ3QuZGYpKQpjb2xuYW1lcyhncS5kZikgPC0gc3ViKCIuR1EiLCAiIiwgY29sbmFtZXMoZ3EuZGYpKQpjb2xuYW1lcyhkcC5kZikgPC0gc3ViKCIuRFAiLCAiIiwgY29sbmFtZXMoZHAuZGYpKQoKIyBDbGVhbi11cApybShzb3VyY2VfZGF0YV9mb2xkZXIsIHByZWZpeCwgY292YXJfZmlsZSwgZGVtb2dyYXBoaWNzX2ZpbGUsIHNhbXBsZXNfZmlsZSwgdnZfZmlsZSwgZ3RfZmlsZSwgZ3FfZmlsZSwgZHBfZmlsZSwga2dlbl9maWxlLCBleGFjX2ZpbGUsIHBoZW5vdHlwZXNfdXBkYXRlX2ZpbGUsIEJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzX2ZpbGUpCgpgYGAKCiMgY2hlY2tfZGF0YQoKYGBge3IgY2hlY2tfZGF0YX0KCmRpbShndC5kZikKc3RyKGd0LmRmLCBsaXN0Lmxlbj01KQpndC5kZlsxOjUsMTo1XQoKZGltKGdxLmRmKQpzdHIoZ3EuZGYsIGxpc3QubGVuPTUpCmdxLmRmWzE6NSwxOjVdCgpkaW0oZHAuZGYpCnN0cihkcC5kZiwgbGlzdC5sZW49NSkKZHAuZGZbMTo1LDE6NV0KCmRpbShjb3Zhci5kZikKc3RyKGNvdmFyLmRmKQpjb3Zhci5kZlsxOjUsMTo1XQoKZGltKHNhbXBsZXMuZGYpCnN0cihzYW1wbGVzLmRmKQpzYW1wbGVzLmRmWzE6NSxdCgpkaW0oZGVtb2dyYXBoaWNzLmRmKQpzdHIoZGVtb2dyYXBoaWNzLmRmKQpkZW1vZ3JhcGhpY3MuZGZbMTo1LDE6NV0KCmRpbShwaGVub3R5cGVzX3VwZGF0ZS5kZikKc3RyKHBoZW5vdHlwZXNfdXBkYXRlLmRmKQpwaGVub3R5cGVzX3VwZGF0ZS5kZlsxOjUsMTo1XQoKZGltKEJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmKQpzdHIoQlJDQTFfQlJDQTJfUEFMQjJfY2FzZXMuZGYpCkJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmWzE6NSwxOjVdCgpkaW0odnYuZGYpCnN0cih2di5kZikKdnYuZGZbMTo1LDE6NV0KCmRpbShrZ2VuLmRmKQpzdHIoa2dlbi5kZikKa2dlbi5kZlsxOjUsMTo1XQoKZGltKGV4YWMuZGYpCnN0cihleGFjLmRmKQpleGFjLmRmWzE6NSwxOjVdCgpgYGAKCiMgY29udmVydF9kYXRhX2ZyYW1lc190b19tYXRyaWNlcwoKYGBge3IgY29udmVydF9kYXRhX2ZyYW1lc190b19tYXRyaWNlc30KCmd0Lm14IDwtIGFzLm1hdHJpeChndC5kZikKZ3EubXggPC0gYXMubWF0cml4KGdxLmRmKQpkcC5teCA8LSBhcy5tYXRyaXgoZHAuZGYpCgpkaW0oZ3QubXgpCmNsYXNzKGd0Lm14KQpndC5teFsxOjUsMTo1XQoKZGltKGdxLm14KQpjbGFzcyhncS5teCkKZ3EubXhbMTo1LDE6NV0KCmRpbShkcC5teCkKY2xhc3MoZHAubXgpCmRwLm14WzE6NSwxOjVdCgpybShndC5kZiwgZ3EuZGYsIGRwLmRmKQoKYGBgCgojIGNoZWNrX2NvbnNpc3RlbmNlX29mX3Jvd25hbWVzX2FuZF9jb2xuYW1lcwoKYGBge3IgY2hlY2tfY29uc2lzdGVuY2Vfb2Zfcm93bmFtZXNfYW5kX2NvbG5hbWVzfQoKIyByb3duYW1lcwpzdW0ocm93bmFtZXMoZ3QubXgpICE9IHJvd25hbWVzKGdxLm14KSkKc3VtKHJvd25hbWVzKGd0Lm14KSAhPSByb3duYW1lcyhkcC5teCkpCnN1bShyb3duYW1lcyhndC5teCkgIT0gcm93bmFtZXModnYuZGYpKQpzdW0ocm93bmFtZXMoZ3QubXgpICE9IHJvd25hbWVzKGtnZW4uZGYpKQpzdW0ocm93bmFtZXMoZ3QubXgpICE9IHJvd25hbWVzKGV4YWMuZGYpKQoKIyBjb2xuYW1lcwpzdW0oY29sbmFtZXMoZ3QubXgpICE9IGNvbG5hbWVzKGdxLm14KSkKc3VtKGNvbG5hbWVzKGd0Lm14KSAhPSBjb2xuYW1lcyhkcC5teCkpCgpgYGAKCiMgY29udmVydF9ibGFua3NfdG9fTkFzCgpgYGB7ciBjb252ZXJ0X2JsYW5rc190b19OQXN9CgpOQSAtPiB2di5kZlt2di5kZiRFeGlzdGluZ192YXJpYXRpb24gPT0gIiIsICJFeGlzdGluZ192YXJpYXRpb24iXSAjIG5vIGJsYW5rcyBpbiBvdGhlciBmaWVsZHMKTkEgLT4gY292YXIuZGZbY292YXIuZGYgPT0gIiJdICMgMSBjYXNlIGluIGNoZW1vX2hvcm1vbmUgKHJvdyA4NCkKTkEgLT4gZGVtb2dyYXBoaWNzLmRmW2RlbW9ncmFwaGljcy5kZiA9PSAiIl0gIyAyIGNhc2VzIGluIHJlZ2lzdHJ5IGFuZCBvbmUgaW4gY2hlbW9faG9ybW9uZQpOQSAtPiBCUkNBMV9CUkNBMl9QQUxCMl9jYXNlcy5kZltCUkNBMV9CUkNBMl9QQUxCMl9jYXNlcy5kZiA9PSAiIl0gIyA4IGNhc2VzIGluIG5vdGVzCgojIE5vIGJsYW5rcyBpbiBvdGhlciB0YWJsZXMKCmBgYAoKIyByZWZvcm1hdF9waGVub3R5cGVzX3VwZGF0ZQoKYGBge3IgcmVmb3JtYXRfcGhlbm90eXBlc191cGRhdGV9CgojIENvbnZlcnN0IGZhY3RvcnMgdG8gdmVjdG9ycyAodG8gc3ltcGxpZnkgY29tcGFyaXNvbnMpCnN0cihwaGVub3R5cGVzX3VwZGF0ZS5kZikKYXMuY2hhcmFjdGVyKHBoZW5vdHlwZXNfdXBkYXRlLmRmJGd3YXNfaWQpIC0+IHBoZW5vdHlwZXNfdXBkYXRlLmRmJGd3YXNfaWQKYXMuY2hhcmFjdGVyKHBoZW5vdHlwZXNfdXBkYXRlLmRmJHdlc19pZCkgLT4gcGhlbm90eXBlc191cGRhdGUuZGYkd2VzX2lkCnRyaW13cyhhcy5jaGFyYWN0ZXIocGhlbm90eXBlc191cGRhdGUuZGYkaGlzdF9jYXQpKSAtPiBwaGVub3R5cGVzX3VwZGF0ZS5kZiRoaXN0X2NhdAp0cmltd3MoYXMuY2hhcmFjdGVyKHBoZW5vdHlwZXNfdXBkYXRlLmRmJGNoZW1vX2NhdCkpIC0+IHBoZW5vdHlwZXNfdXBkYXRlLmRmJGNoZW1vX2NhdAoKYXMuY2hhcmFjdGVyKHBoZW5vdHlwZXNfdXBkYXRlLmRmJGVyMSkgLT4gcGhlbm90eXBlc191cGRhdGUuZGYkZXIxCmFzLmNoYXJhY3RlcihwaGVub3R5cGVzX3VwZGF0ZS5kZiRwcjEpIC0+IHBoZW5vdHlwZXNfdXBkYXRlLmRmJHByMQphcy5jaGFyYWN0ZXIocGhlbm90eXBlc191cGRhdGUuZGYkaG9ybW9uZSkgLT4gcGhlbm90eXBlc191cGRhdGUuZGYkaG9ybW9uZQoKTkEgLT4gcGhlbm90eXBlc191cGRhdGUuZGZbcGhlbm90eXBlc191cGRhdGUuZGYkZXIxID09ICJtaXNzaW5nIiwgImVyMSJdCk5BIC0+IHBoZW5vdHlwZXNfdXBkYXRlLmRmW3BoZW5vdHlwZXNfdXBkYXRlLmRmJHByMSA9PSAibWlzc2luZyIsICJwcjEiXQpOQSAtPiBwaGVub3R5cGVzX3VwZGF0ZS5kZltwaGVub3R5cGVzX3VwZGF0ZS5kZiRob3Jtb25lID09ICJtaXNzaW5nIiwgImhvcm1vbmUiXQoKYXMubnVtZXJpYyhwaGVub3R5cGVzX3VwZGF0ZS5kZiRlcjEpIC0+IHBoZW5vdHlwZXNfdXBkYXRlLmRmJGVyMQphcy5udW1lcmljKHBoZW5vdHlwZXNfdXBkYXRlLmRmJHByMSkgLT4gcGhlbm90eXBlc191cGRhdGUuZGYkcHIxCmFzLm51bWVyaWMocGhlbm90eXBlc191cGRhdGUuZGYkaG9ybW9uZSkgLT4gcGhlbm90eXBlc191cGRhdGUuZGYkaG9ybW9uZQoKc3RyKHBoZW5vdHlwZXNfdXBkYXRlLmRmKQoKYGBgCgojIHNhdmVfZGF0YQoKYGBge3Igc2F2ZV9kYXRhfQoKc2F2ZS5pbWFnZShwYXN0ZShpbnRlcmltX2RhdGFfZm9sZGVyLCAicjAxX3JlYWRfYW5kX2NsZWFuX2RhdGFfamFuMjAxNy5SRGF0YSIsIHNlcD0iLyIpKQoKYGBgCgojIGZpbmFsX3NlY3Rpb24KCmBgYHtyIGZpbmFsX3NlY3Rpb259CgpscygpCnNlc3Npb25JbmZvKCkKU3lzLnRpbWUoKQoKYGBg